{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from konlpy.tag import Twitter\n",
    "import tensorflow as tf\n",
    "import math\n",
    "from sklearn import datasets\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.decomposition import PCA\n",
    "from scipy.sparse import csr_matrix\n",
    "from scipy.sparse import issparse\n",
    "pos_tagger = Twitter()\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "MODEL_DIRECTORY = \"../\"\n",
    "\n",
    "patent_data_all_file_name = \"patent_data_all.pkl\"\n",
    "patent_data_all_dir = os.path.join(MODEL_DIRECTORY, patent_data_all_file_name)\n",
    "\n",
    "STOP_WORD_KOREAN = \"stop_word_korea.txt\"\n",
    "STOP_WORD_KOREAN_DIR = os.path.join(MODEL_DIRECTORY, STOP_WORD_KOREAN)\n",
    "\n",
    "SAMPLING_PATENT_DATA_file_name = \"sampling_patent_data.pkl\"\n",
    "SAMPLING_PATENT_DATA_dir = os.path.join(MODEL_DIRECTORY, SAMPLING_PATENT_DATA_file_name)\n",
    "\n",
    "SAMPLING_TAGGING_DATA = \"tagging_data_all.pkl\"\n",
    "SAMPLING_TAGGING_DATA_DIR = os.path.join(MODEL_DIRECTORY, SAMPLING_TAGGING_DATA)\n",
    "\n",
    "SAMPLING_PATENT_TF_IDF_NP = \"sampling_patent_tf_idf.npy\"\n",
    "SAMPLING_PATENT_TF_IDF_NP_DIR = os.path.join(MODEL_DIRECTORY, SAMPLING_PATENT_TF_IDF_NP)\n",
    "\n",
    "SAMPLING_TF_IDF_VECTORIZER = \"sampling_tf_idf_vectorizer.pkl\"\n",
    "SAMPLING_TF_IDF_VECTORIZER_DIR = os.path.join(MODEL_DIRECTORY, SAMPLING_TF_IDF_VECTORIZER)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "df_patent_data_all = pd.read_pickle(SAMPLING_PATENT_DATA_dir)\n",
    "patet_tf_idf = np.load(SAMPLING_PATENT_TF_IDF_NP_DIR)\n",
    "\n",
    "patent_vector = patet_tf_idf\n",
    "df_patent_data_all = df_patent_data_all.ix[:len(patent_vector), :]\n",
    "\n",
    "len(patent_vector), len(df_patent_data_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "target_row = [key for key, value in enumerate(df_patent_data_all[\"ipc_4digit\"].notnull().tolist()) if value == False]\n",
    "\n",
    "df_patent_data_all = df_patent_data_all[df_patent_data_all[\"ipc_4digit\"].notnull()]\n",
    "patent_vector = np.delete(patent_vector, target_row, 0)\n",
    "\n",
    "len(target_row), len(patent_vector), len(df_patent_data_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "target_row = [key for key, value in enumerate(df_patent_data_all[\"ipc_4digit\"].str.startswith(\"4\").tolist()) if value == True]\n",
    "\n",
    "df_patent_data_all = df_patent_data_all[ ~(df_patent_data_all[\"ipc_4digit\"].str.startswith(\"4\")) ]\n",
    "patent_vector = np.delete(patent_vector, target_row, 0)\n",
    "\n",
    "len(patent_vector), len(df_patent_data_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "temp_list = \",\".join([str(value).upper()[0] for value in list(df_patent_data_all[\"ipc_4digit\"].values)])\n",
    "ipc_1digit_set = set([ipc.strip() for ipc in temp_list.split(\",\")])\n",
    "ipc_1digit_set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "len(ipc_1digit_set)\n",
    "\n",
    "ipc_1digit_dict = {}\n",
    "for i, value in enumerate(\"ABCDEFGH\"):\n",
    "    ipc_1digit_dict[value] = i\n",
    "\n",
    "ipc_1digit_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_ipc_1digit = np.zeros( (len(patent_vector), len(ipc_1digit_set)) , dtype=np.int)\n",
    "\n",
    "y_ipc_1digit.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for i, value in enumerate(df_patent_data_all[\"ipc_4digit\"].values):\n",
    "    ipc_list = [ipc_1digit_dict[ipc.strip()[0].upper()] for ipc in str(value).split(\",\")]\n",
    "    y_ipc_1digit[i][ipc_list] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    patent_vector, y_ipc_1digit, test_size=0.60, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "del y_ipc_1digit \n",
    "del patent_vector\n",
    "del df_patent_data_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def make_index(y):\n",
    "    one_result = []\n",
    "    zero_result = []\n",
    "    len_of_x_data = len(y)\n",
    "    for i in y:\n",
    "        one_result.append(np.where(i ==1.0)[0])\n",
    "        zero_result.append(np.where(i==0.0)[0])\n",
    "    return [one_result,zero_result]\n",
    "\n",
    "def select_index_data_of_output(output,index_list):\n",
    "    data_value_list = []\n",
    "    for i in range(66248):\n",
    "        data_value_list.append([])\n",
    "        cnt = 0\n",
    "        data_value_list[i].append([])\n",
    "        for k in range(len(index_list[0][i])): \n",
    "            data_value_list[i][cnt].append(output[i][index_list[0][i][k]])\n",
    "        cnt +=1\n",
    "        data_value_list[i].append([])\n",
    "        for j in range(len(index_list[1][i])):\n",
    "            data_value_list[i][cnt].append(output[i][index_list[1][i][j]])\n",
    "    return data_value_list\n",
    "\n",
    "        \n",
    "def make_cartesian_list(x):\n",
    "    cartesian_list = []\n",
    "    yi_length = []\n",
    "    nyi_length = []\n",
    "    for i in range(len(x)):\n",
    "        yi_length.append(len(x[i][0]))\n",
    "        nyi_length.append(len(x[i][1]))\n",
    "        cartesian_list.append([])\n",
    "        for j in x[i][0]:\n",
    "            for k in x[i][1]:\n",
    "                cartesian_list[i].append([j,k])\n",
    "    return cartesian_list,yi_length,nyi_length\n",
    "\n",
    "def bp_mll_exp_function(cartesian_list_instance):\n",
    "    return tf.exp(cartesian_list_instance[1]-cartesian_list_instance[0])\n",
    "\n",
    "def get_error(cartesian_list):\n",
    "    cnt = 0\n",
    "    final_global_error = 0\n",
    "    global_error = 0\n",
    "\n",
    "    for docu in cartesian_list[0]:\n",
    "\n",
    "        for instance in docu:\n",
    "            global_error +=  bp_mll_exp_function(instance)\n",
    "        global_error  = global_error * (1/(cartesian_list[1][cnt]*cartesian_list[2][cnt]))\n",
    "        final_global_error += global_error\n",
    "\n",
    "        cnt +=1\n",
    "    return final_global_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class RankResults:\n",
    "    def __init__(self):\n",
    "        self.predictedLabels = []\n",
    "        self.topRankedLabels = []\n",
    "        self.outputs = []\n",
    "\n",
    "    def add(self, predict_set, top_label, output):\n",
    "        self.predictedLabels.append(predict_set)\n",
    "        self.topRankedLabels.append(top_label)\n",
    "        self.outputs.append(output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = tf.placeholder(\"float\", [None,19243] )\n",
    "W = tf.Variable(tf.random_uniform([19243,100],-0.5,0.5))\n",
    "W2 = tf.Variable(tf.random_uniform([100,8],-0.5,0.5))\n",
    "B1 = tf.Variable(tf.zeros([100]))\n",
    "B2 = tf.Variable(tf.zeros([8]))\n",
    "# Construct model\n",
    "First_hidden = tf.tanh(tf.matmul(X,W)+B1)\n",
    "print(\"1\")\n",
    "hypothesis = tf.tanh((tf.matmul(First_hidden,W2)+B2))\n",
    "print(\"2\")\n",
    "y_data_index = make_index(y_train)\n",
    "print(\"3\")\n",
    "x_data_index_value = select_index_data_of_output(hypothesis,y_data_index)\n",
    "print(\"4\")\n",
    "cartesian_list= make_cartesian_list(x_data_index_value)\n",
    "print(\"5\")\n",
    "cost=get_error(cartesian_list)\n",
    "\n",
    "a = tf.Variable(0.2)\n",
    "optimizer = tf.train.GradientDescentOptimizer(a)\n",
    "train = optimizer.minimize(cost)\n",
    "\n",
    "init = tf.initialize_all_variables()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(init)\n",
    "    for step in range(500):\n",
    "        sess.run(train,feed_dict={X:train_data_trans})\n",
    "        print(sess.run(hypothesis,feed_dict={X:train_data_trans}))\n",
    "        print(sess.run(cost,feed_dict={X:train_data_trans}))\n",
    "    a = sess.run(hypothesis,feed_dict={X:train_data_trans})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:ml_python]",
   "language": "python",
   "name": "conda-env-ml_python-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
